% Start from a clean slate: close any open figures and clear the console.
close all
clc

% Each "folder*" variable actually holds the full path of ONE PDF file.
% A fileDatastore is created per file, with extractFileText as the read
% function, so reading a datastore yields the text extracted from that PDF.

% Paper 1
folder = "/Users/baffoeb/Desktop/Paper 5 works/A hybrid approach to innovation by social enterprises lessons from Africa.pdf";
fds = fileDatastore(folder,"ReadFcn",@extractFileText);

% Paper 2
folder1 = "/Users/baffoeb/Desktop/Paper 5 works/Contents and Determinants of Corporate Social Responsibility Website Reporting in Sub-Saharan Africa A Seven-Country Study.pdf";
fds1 = fileDatastore(folder1,"ReadFcn",@extractFileText);

% Paper 3
folder2 = "/Users/baffoeb/Desktop/Paper 5 works/Corporate engagement in humanitarian action Concepts, challenges, and areas for international business research.pdf";
fds2 = fileDatastore(folder2,"ReadFcn",@extractFileText);

% Paper 4
folder3 = "/Users/baffoeb/Desktop/Paper 5 works/The coordination roles of relief organisations in humanitarian logistics-jensen2016.pdf";
fds3 = fileDatastore(folder3,"ReadFcn",@extractFileText);

% Paper 5
folder4 = "/Users/baffoeb/Desktop/Paper 5 works/Worlds Apart But Much Alike Donor Funding and the Homogenization of NGOs in Ghana and Indonesia.pdf";
fds4 = fileDatastore(folder4,"ReadFcn",@extractFileText);
 % Drain every datastore and stack whatever it returns into one column per
 % paper: str holds paper 1, str1 paper 2, ... str4 paper 5.
 % Each accumulator starts as [] and grows by vertical concatenation.

 % Paper 1
 str = [];
 while hasdata(fds)
     textData = read(fds);
     str = vertcat(str, textData);
 end

 % Paper 2
 str1 = [];
 while hasdata(fds1)
     textData = read(fds1);
     str1 = vertcat(str1, textData);
 end

 % Paper 3
 str2 = [];
 while hasdata(fds2)
     textData = read(fds2);
     str2 = vertcat(str2, textData);
 end

 % Paper 4
 str3 = [];
 while hasdata(fds3)
     textData = read(fds3);
     str3 = vertcat(str3, textData);
 end

 % Paper 5
 str4 = [];
 while hasdata(fds4)
     textData = read(fds4);
     str4 = vertcat(str4, textData);
 end
 
% Clean each paper's text and build its raw and cleaned bag-of-words models.
%
% The identical preprocessing pipeline used to be copy-pasted once per paper,
% and every copy overwrote numWordsCleaned / numWordsRaw / reduction, so only
% the last paper's statistics survived. The pipeline now runs once per paper
% in a loop and the per-paper vocabulary reduction is kept in "reductions".
%
% Inputs  (from the workspace): str, str1, str2, str3, str4
% Outputs (to the workspace):
%   cleanedBag, cleanedBag1..cleanedBag4  - bag-of-words after cleaning
%   rawBag, rawBag1..rawBag4              - bag-of-words of the untouched text
%   str, str1..str4                       - text with empty documents removed
%   reductions(k)                         - 1 - cleanedVocab/rawVocab for paper k
%   cleanedDocuments, rawDocuments, idx, numWordsCleaned, numWordsRaw,
%   reduction                             - values for the LAST paper, exactly
%                                           as the original script left them
texts = {str, str1, str2, str3, str4};
numPapers = numel(texts);
cleanedBags = cell(1,numPapers);
rawBags = cell(1,numPapers);
reductions = zeros(1,numPapers);

for k = 1:numPapers
    % --- Preprocessing (the intermediate displays are kept on purpose so
    % the effect of each step can be inspected in the Command Window) ---
    cleanedDocuments = tokenizedDocument(texts{k});
    cleanedDocuments(1)
    % Part-of-speech tagging is done before stop-word removal and is
    % followed by lemmatization; the original step order is preserved.
    cleanedDocuments = addPartOfSpeechDetails(cleanedDocuments);
    cleanedDocuments = removeStopWords(cleanedDocuments);
    cleanedDocuments(1)
    cleanedDocuments = normalizeWords(cleanedDocuments,'Style','lemma');
    cleanedDocuments(1)
    cleanedDocuments = erasePunctuation(cleanedDocuments);
    cleanedDocuments(1)
    % Same thresholds as before: removeShortWords(...,2), removeLongWords(...,15).
    cleanedDocuments = removeShortWords(cleanedDocuments,2);
    cleanedDocuments = removeLongWords(cleanedDocuments,15);
    cleanedDocuments(1)

    % --- Cleaned bag-of-words ---
    bag = bagOfWords(cleanedDocuments);
    bag = removeInfrequentWords(bag,2);
    [bag,idx] = removeEmptyDocuments(bag);
    % Keep the source text aligned with the bag: drop the same documents.
    texts{k}(idx) = [];
    cleanedBags{k} = bag;

    % --- Raw bag-of-words, built from the same (aligned) text ---
    rawDocuments = tokenizedDocument(texts{k});
    rawBags{k} = bagOfWords(rawDocuments);

    % --- Vocabulary reduction achieved by the cleaning ---
    numWordsCleaned = bag.NumWords;
    numWordsRaw = rawBags{k}.NumWords;
    reduction = 1 - numWordsCleaned/numWordsRaw;
    reductions(k) = reduction;
end

% Publish the results under the variable names the rest of the script uses.
[str, str1, str2, str3, str4] = texts{:};
[cleanedBag, cleanedBag1, cleanedBag2, cleanedBag3, cleanedBag4] = cleanedBags{:};
[rawBag, rawBag1, rawBag2, rawBag3, rawBag4] = rawBags{:};

% Compare the raw and cleaned vocabulary of every paper as word clouds.
%
% Layout: a 2-by-5 grid with one column per paper, raw data in the top row
% and cleaned data in the bottom row. The earlier version drew the cleaned
% clouds into the same subplot(1,5,k) slots as the raw ones, which replaced
% the raw clouds so they were never visible. The raw titles also contained
% a duplicated word ("Raw Data Data"), which is fixed here.
rawBagsToPlot = {rawBag, rawBag1, rawBag2, rawBag3, rawBag4};
cleanedBagsToPlot = {cleanedBag, cleanedBag1, cleanedBag2, cleanedBag3, cleanedBag4};
numPapersToPlot = numel(rawBagsToPlot);

figure
for paperIdx = 1:numPapersToPlot
    % Top row: raw bag-of-words for this paper.
    subplot(2,numPapersToPlot,paperIdx)
    wordcloud(rawBagsToPlot{paperIdx});
    title("Raw Data for Paper " + paperIdx)

    % Bottom row, same column: cleaned bag-of-words for this paper.
    subplot(2,numPapersToPlot,numPapersToPlot + paperIdx)
    wordcloud(cleanedBagsToPlot{paperIdx});
    title("Wordcloud for Paper " + paperIdx + " Cleaned Data")
end
